import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install dtreeviz
    # updating an already-imported module requires restarting the Colab runtime
    !pip install -U scikit-learn
    !pip install -U xgboost  # dtreeviz needs xgboost >= 1.2
import numpy as np
import pandas as pd
import xgboost
import xgboost as xgb
import sklearn
from sklearn import metrics as skmetrics
import joblib
# model eval
import graphviz
import dtreeviz
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree
# params
SEED = 100
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-24

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

pandas    1.1.4
sklearn   0.23.2
xgboost   1.2.1
watermark 2.0.2
graphviz  0.10.1
joblib    0.17.0
numpy     1.18.5
def adjustedR2(rsquared, nrows, ncols):
    # penalize R-squared for the number of predictors (ncols) relative to the sample size (nrows)
    return rsquared - (ncols - 1) / (nrows - ncols) * (1 - rsquared)
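A quick numeric sanity check of this formula (hypothetical values, not from the data):
# hypothetical check: R^2 = 0.9 with nrows = 100, ncols = 10
# adjusted R^2 = 0.9 - (10-1)/(100-10) * (1-0.9) = 0.9 - 0.01 = 0.89
assert abs(adjustedR2(0.9, 100, 10) - 0.89) < 1e-9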
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(skmetrics.mean_squared_error(ytest, ypreds))
    r2 = skmetrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = skmetrics.explained_variance_score(ytest, ypreds)
    print(f"""
    RMSE              : {rmse:,.2f}
    Explained Variance: {evs:.6f}
    R-Squared         : {r2:,.6f}
    Adjusted R-squared: {ar2:,.6f}
    """)
def show_methods(obj, ncols=4):
    # list the public attributes/methods of obj as an ncols-wide dataframe
    lst = [i for i in dir(obj) if i[0] != '_']
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
if ENV_COLAB:
    path_git = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    project = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_git + project
else:
    data_path_parent = '../data/'
data_path_Xtrain = data_path_parent + 'processed/Xtrain.csv.zip'
data_path_ytrain = data_path_parent + 'processed/ytrain.csv'
data_path_Xtest = data_path_parent + 'processed/Xtest.csv.zip'
data_path_ytest = data_path_parent + 'processed/ytest.csv'
target = 'price'
train_size = 0.8
print(data_path_Xtest)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/processed/Xtest.csv.zip
df_Xtrain = pd.read_csv(data_path_Xtrain, compression='zip')
ser_ytrain = pd.read_csv(data_path_ytrain, header=None)
ytrain = np.array(ser_ytrain).flatten()
ytrain_log1p = np.log1p(ytrain)

df_Xtest = pd.read_csv(data_path_Xtest, compression='zip')
ser_ytest = pd.read_csv(data_path_ytest, header=None)
ytest = np.array(ser_ytest).flatten()
features = list(df_Xtest.columns)
s = f"""
df_Xtest = {df_Xtest.shape}
ytest = {ytest.shape}
"""
print(s)
display(df_Xtest.head(2))
display(ser_ytest.head(2))
assert df_Xtest.shape[0] == ytest.shape[0]
df_Xtest = (4323, 67)
ytest = (4323,)
| | age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_above_sq | log1p_sqft_basement | log1p_sqft_basement_sq | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_living15_sq | log1p_sqft_living_sq | log1p_sqft_lot | log1p_sqft_lot15 | log1p_sqft_lot15_sq | log1p_sqft_lot_sq | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.372335 | -1.316486 | -1.265291 | -0.845091 | -1.320662 | -0.885667 | -0.801818 | 0.506258 | 0.326221 | -0.39033 | -0.30222 | -0.630613 | -0.035694 | -0.08937 | 0.735526 | -0.595921 | -0.294513 | 0.933474 | 0.806845 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | -1.504623 | 0.948862 | 0.938843 | -0.795545 | -0.779839 | 0.590708 | 0.791054 | 0.779280 | 0.57266 | -0.489502 | -0.638404 | -0.634197 | -0.500625 | 0.025916 | -0.207998 | 0.793435 | -0.658262 | 0.397588 | 0.659102 | -0.234915 | -0.293699 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 1.361464 | -0.207992 | 1.305630 | -0.693043 | -1.422563 |
| 1 | -0.084817 | -0.005269 | -0.062185 | -0.285363 | -0.139825 | -0.348085 | -0.801818 | 0.506258 | 0.326221 | -0.39033 | -0.30222 | 0.902903 | -0.035694 | -0.08937 | -1.359572 | 1.678075 | -0.294513 | 0.933474 | 0.806845 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | -2.128172 | -0.094722 | -0.122423 | -0.795545 | -0.779839 | -0.460316 | -0.671771 | -0.680476 | -0.48267 | -0.564254 | 0.555413 | 0.499294 | -0.567770 | -0.504869 | -0.207998 | -0.272741 | -0.658262 | -0.563091 | -0.697681 | -0.242762 | -0.021608 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 0.107715 | -0.207992 | 0.028586 | 1.442912 | -1.441324 |
| | 0 |
|---|---|
| 0 | 285000.0 |
| 1 | 239950.0 |
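Since the model below is trained on log1p(price), its predictions must be mapped back to dollars with expm1; the two transforms are exact inverses (a quick sketch):
# np.log1p and np.expm1 are inverse transforms, so dollar-scale targets round-trip exactly
x = np.array([285000.0, 239950.0])
assert np.allclose(np.expm1(np.log1p(x)), x)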
# saved models
if ENV_COLAB:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
    !mv model_xgb_logtarget.dump?raw=true ../models/model_xgb_logtarget.dump
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.dump

path_model_xgb = '../models/model_xgb_logtarget.dump'
model = xgboost.XGBRegressor()
model.load_model(path_model_xgb)

ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds:', ypreds[:3])
--2020-11-24 00:29:14--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
HTTP request sent, awaiting response... 200 OK
Length: 821441 (802K) [application/octet-stream]
2020-11-24 00:29:15 (13.8 MB/s) - ‘model_xgb_logtarget.dump?raw=true’ saved [821441/821441]

model_xgb_logtarget.dump  model_xgb_logtarget.joblib
804K	../models/model_xgb_logtarget.dump

ytest: [285000. 239950. 460000.]
ypreds:  [343218.4  204292.31 508420.8 ]
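For reference, the native-format round trip used above presumably looks like this (a minimal sketch; the path is illustrative):
# sketch: save and re-load via XGBoost's own serialization format
model.save_model('../models/model_xgb_logtarget.dump')  # illustrative path
loaded = xgboost.XGBRegressor()
loaded.load_model('../models/model_xgb_logtarget.dump')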
# saved models (joblib alternative)
use_joblib = 0
if use_joblib:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.joblib?raw=true
    !mv model_xgb_logtarget.joblib?raw=true ../models/model_xgb_logtarget.joblib
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.joblib

    path_model_xgb = '../models/model_xgb_logtarget.joblib'
    model = joblib.load(path_model_xgb)  # joblib files are loaded with joblib, not load_model
    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds:', ypreds[:3])
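Unlike load_model, joblib serializes the whole sklearn-wrapper object, so no fresh XGBRegressor is needed on reload (a sketch, assuming the model object above; the path is illustrative):
# sketch: joblib round trip; the pickled object restores with hyperparameters intact
joblib.dump(model, '../models/model_xgb_logtarget.joblib')
model2 = joblib.load('../models/model_xgb_logtarget.joblib')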
%%time
train_model = 0
if train_model:
    params_xgb = dict(n_jobs=-1,
                      random_state=SEED,
                      objective='reg:squarederror',
                      n_estimators=1200,
                      max_depth=3,         # default 6
                      reg_alpha=1,         # default alpha = 0, alias reg_alpha
                      reg_lambda=5,        # default lambda = 1, alias reg_lambda
                      subsample=1,         # default 1
                      gamma=0,             # default gamma = 0, alias min_split_loss
                      min_child_weight=1,  # default 1
                      colsample_bytree=1,  # default 1
                      learning_rate=0.1,   # default eta = 0.3
                      tree_method='auto',  # default auto; use gpu_hist on GPU
                      )
    model = xgboost.XGBRegressor(**params_xgb)
    model.fit(df_Xtrain, ytrain_log1p)

    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds:', ypreds[:3])
    print_regr_eval(ytest, ypreds, df_Xtest.shape[1])
CPU times: user 4 µs, sys: 1 µs, total: 5 µs Wall time: 7.63 µs
show_methods(model)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | apply | get_booster | max_delta_step | random_state |
| 1 | base_score | get_num_boosting_rounds | max_depth | reg_alpha |
| 2 | booster | get_params | min_child_weight | reg_lambda |
| 3 | coef_ | get_xgb_params | missing | save_model |
| 4 | colsample_bylevel | gpu_id | monotone_constraints | scale_pos_weight |
| 5 | colsample_bynode | importance_type | n_estimators | score |
| 6 | colsample_bytree | interaction_constraints | n_features_in_ | set_params |
| 7 | evals_result | intercept_ | n_jobs | subsample |
| 8 | feature_importances_ | kwargs | num_parallel_tree | tree_method |
| 9 | fit | learning_rate | objective | validate_parameters |
| 10 | gamma | load_model | predict | verbosity |
bst = model.get_booster()
bst.trees_to_dataframe().head(2)
| | Tree | Node | ID | Feature | Split | Yes | No | Missing | Gain | Cover |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0-0 | grade | 0.716166 | 0-1 | 0-2 | 0-1 | 689.75 | 17290.0 |
| 1 | 0 | 1 | 0-1 | lat | -0.194376 | 0-3 | 0-4 | 0-3 | 105.75 | 13905.0 |
bst.trees_to_dataframe()['Tree'].nunique()  # one tree per boosting round, matching n_estimators=1200
1200
bst.trees_to_dataframe().query("Tree == 0")
| | Tree | Node | ID | Feature | Split | Yes | No | Missing | Gain | Cover |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0-0 | grade | 0.716166 | 0-1 | 0-2 | 0-1 | 689.750000 | 17290.0 |
| 1 | 0 | 1 | 0-1 | lat | -0.194376 | 0-3 | 0-4 | 0-3 | 105.750000 | 13905.0 |
| 2 | 0 | 2 | 0-2 | Leaf | NaN | NaN | NaN | NaN | 1.313649 | 3385.0 |
| 3 | 0 | 3 | 0-3 | Leaf | NaN | NaN | NaN | NaN | 1.208724 | 5749.0 |
| 4 | 0 | 4 | 0-4 | Leaf | NaN | NaN | NaN | NaN | 1.260315 | 8156.0 |
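Beyond the per-tree dataframe, the booster can report which leaf each row lands in, per tree (a sketch using the booster above):
# leaf index per test row per tree: an (n_rows, n_trees) matrix
dtest = xgb.DMatrix(df_Xtest)
leaf_ids = bst.predict(dtest, pred_leaf=True)
print(leaf_ids.shape)  # expected (4323, 1200)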
xgb.plot_tree(model)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3907cc9e48>
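For a standalone rendering of a single tree, xgb.to_graphviz can be used instead of plot_tree (a sketch; num_trees selects the tree index and the output filename is illustrative):
# render tree 0 as a graphviz Source; displays inline in a notebook
g = xgb.to_graphviz(model, num_trees=0)
g.render('tree0', format='png', cleanup=True)  # writes tree0.png (illustrative filename)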
ShadowXGBDTree(self,
booster:xgboost.core.Booster,
tree_index:int,
x_data,
y_data,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.List[str],typing.Mapping[int, str])=None)
import graphviz
from dtreeviz import trees as dtrees
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree
show_methods(dtrees)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | Color | adjust_colors | graphviz | rgb2hex |
| 1 | DTreeViz | class_leaf_viz | inline_svg_images | rtreeviz_bivar_3D |
| 2 | List | class_split_viz | myround | rtreeviz_bivar_heatmap |
| 3 | Mapping | ctreeviz_bivar | np | rtreeviz_univar |
| 4 | NUM_BINS | ctreeviz_leaf_samples | os | run |
| 5 | Number | ctreeviz_univar | patches | scale_SVG |
| 6 | PLATFORM | describe_node_sample | pd | tempfile |
| 7 | Path | draw_legend | plt | tree |
| 8 | ShadowDecTree | draw_piechart | prediction_path | view |
| 9 | ShadowDecTreeNode | dtreeviz | prop_size | viz_leaf_criterion |
| 10 | Tuple | explain_prediction_path | regr_leaf_viz | viz_leaf_samples |
| 11 | add_classifier_legend | get_num_bins | regr_split_viz | viz_leaf_target |
# help(ShadowXGBDTree)
bst_shadow = ShadowXGBDTree(bst, tree_index=1,
x_data=df_Xtrain, y_data=ytrain_log1p,
feature_names=features, target_name=target)
dtreeviz(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str], typing.List[str])=None,
tree_index:int=None,
precision:int=2,
orientation:('TD', 'LR')='TD',
instance_orientation:('TD', 'LR')='LR',
show_root_edge_labels:bool=True,
show_node_labels:bool=False,
show_just_path:bool=False,
fancy:bool=True,
histtype:('bar', 'barstacked', 'strip')='barstacked',
highlight_path:List[int]=[],
X:numpy.ndarray=None,
max_X_features_LR:int=10,
max_X_features_TD:int=20,
label_fontsize:int=12,
ticks_fontsize:int=8,
fontname:str='Arial',
colors:dict=None,
scale=1.0) -> dtreeviz.trees.DTreeViz
# help(dtrees.dtreeviz)
dtrees.dtreeviz(bst_shadow)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
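The returned DTreeViz object can also be written to disk rather than displayed inline (a sketch; the filename is illustrative):
# sketch: persist the visualization as SVG via DTreeViz.save
viz = dtrees.dtreeviz(bst_shadow)
viz.save('xgb_tree1.svg')  # illustrative filename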
viz_leaf_samples(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None,
figsize:tuple=(10, 5),
display_type:str='plot',
colors:dict=None,
fontsize:int=14,
fontname:str='Arial',
grid:bool=False,
bins:int=10,
min_samples:int=0,
max_samples:int=None)
dtrees.viz_leaf_samples(bst, df_Xtrain, feature_names=features, tree_index=1)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
describe_node_sample(tree_model,
node_id:int,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None)
dtrees.describe_node_sample(bst, node_id=1,
x_data=df_Xtrain,
feature_names=features,
tree_index=1).iloc[:,:5]
| | age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat |
|---|---|---|---|---|---|
| count | 11685.000000 | 11685.000000 | 11685.000000 | 11685.000000 | 11685.000000 |
| mean | 0.181673 | 0.198875 | 0.195734 | 0.167634 | 0.178939 |
| std | 0.984658 | 0.995964 | 0.998632 | 1.036658 | 0.987304 |
| min | -1.507863 | -1.454509 | -1.265291 | -0.848104 | -1.320662 |
| 25% | -0.593048 | -0.591866 | -0.463220 | -0.655279 | -0.533437 |
| 50% | 0.152358 | 0.201766 | 0.338850 | -0.108606 | 0.253788 |
| 75% | 0.796117 | 0.857374 | 0.739886 | 0.610136 | 0.647401 |
| max | 2.422456 | 2.548154 | 2.344027 | 3.579173 | 2.221851 |
explain_prediction_path(tree_model,
x:numpy.ndarray,
x_data=None,
y_data=None,
explanation_type:('plain_english', 'sklearn_default')='plain_english',
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str],
typing.List[str])=None,
tree_index:int=None)
row = df_Xtrain.iloc[10]
row.head()
age                        -0.694694
age_after_renovation       -0.626372
age_after_renovation_cat   -0.463220
age_after_renovation_sq    -0.671013
age_cat                    -0.533437
Name: 10, dtype: float64
s = dtrees.explain_prediction_path(bst_shadow, row,
explanation_type="plain_english",tree_index=9)
print(s)
lat < -0.21
log1p_sqft_living < 0.48
viz_leaf_target(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
tree_index:int=None,
show_leaf_labels:bool=True,
colors:dict=None,
markersize:int=50,
label_fontsize:int=14,
fontname:str='Arial',
precision:int=1,
figsize:tuple=None,
grid:bool=False,
prediction_line_width:int=2)
dtrees.viz_leaf_target(bst, df_Xtrain, ytrain_log1p,
feature_names=features,
target_name=target,
tree_index=1)
dtrees.viz_leaf_target(bst_shadow)
# NOTE: univariate regression-tree visualization is kept commented out because
# dtreeviz does not yet support it for XGBoost (see the error message below).
# features_reg_univar = ["age"]
# target_reg_univar = "price"
# dtrain_reg_univar = xgb.DMatrix(df_Xtrain[features_reg_univar], ytrain_log1p)
# params_reg_univar = {"max_depth":3,
# "eta":0.05,
# "objective":"reg:squarederror",
# "subsample":1}
# xgb_model_reg_univar = xgb.train(params=params_reg_univar,
# dtrain=dtrain_reg_univar,
# num_boost_round=8)
# xgb_shadow_reg_univar = ShadowXGBDTree(xgb_model_reg_univar, 1,
# df_Xtrain[features_reg_univar], ytrain_log1p,
# features_reg_univar, target_reg_univar)
# dtrees.rtreeviz_univar(xgb_shadow_reg_univar,
# df_Xtrain[features_reg_univar],ytrain_log1p,
# features_reg_univar, target_reg_univar)
err = """
VisualisationNotYetSupportedError: get_min_samples_leaf() is not implemented yet for XGBoost
"""
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 38 secs